{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# TPOT tutorial on the Titanic dataset "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "The Titanic machine learning competition on [Kaggle](https://www.kaggle.com/c/titanic) is one of the most popular beginner's competitions on the platform. We will use that competition here to demonstrate the implementation of TPOT. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "from tpot import TPOTClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "import pandas as pd \n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the data\n",
    "titanic = pd.read_csv('data/titanic_train.csv')\n",
    "titanic.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Data Exploration "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Sex     Survived\n",
       "female  1           233\n",
       "        0            81\n",
       "male    0           468\n",
       "        1           109\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titanic.groupby('Sex').Survived.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pclass  Sex     Survived\n",
       "1       female  1            91\n",
       "                0             3\n",
       "        male    0            77\n",
       "                1            45\n",
       "2       female  1            70\n",
       "                0             6\n",
       "        male    0            91\n",
       "                1            17\n",
       "3       female  0            72\n",
       "                1            72\n",
       "        male    0           300\n",
       "                1            47\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titanic.groupby(['Pclass','Sex']).Survived.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>0.0</th>\n",
       "      <th>1.0</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">1</th>\n",
       "      <th>female</th>\n",
       "      <td>0.031915</td>\n",
       "      <td>0.968085</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>male</th>\n",
       "      <td>0.631148</td>\n",
       "      <td>0.368852</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">2</th>\n",
       "      <th>female</th>\n",
       "      <td>0.078947</td>\n",
       "      <td>0.921053</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>male</th>\n",
       "      <td>0.842593</td>\n",
       "      <td>0.157407</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">3</th>\n",
       "      <th>female</th>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>male</th>\n",
       "      <td>0.864553</td>\n",
       "      <td>0.135447</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Survived            0.0       1.0\n",
       "Pclass Sex                       \n",
       "1      female  0.031915  0.968085\n",
       "       male    0.631148  0.368852\n",
       "2      female  0.078947  0.921053\n",
       "       male    0.842593  0.157407\n",
       "3      female  0.500000  0.500000\n",
       "       male    0.864553  0.135447"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "id = pd.crosstab([titanic.Pclass, titanic.Sex], titanic.Survived.astype(float))\n",
    "id.div(id.sum(1).astype(float), 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Data Munging "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "The first and most important step in using TPOT on any data set is to rename the target class/response variable to `class`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "titanic.rename(columns={'Survived': 'class'}, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "At present, TPOT requires all the data to be in numerical format. As we can see below, our data set has 5 categorical variables which contain non-numerical values: `Name`, `Sex`, `Ticket`, `Cabin` and `Embarked`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PassengerId      int64\n",
       "class            int64\n",
       "Pclass           int64\n",
       "Name            object\n",
       "Sex             object\n",
       "Age            float64\n",
       "SibSp            int64\n",
       "Parch            int64\n",
       "Ticket          object\n",
       "Fare           float64\n",
       "Cabin           object\n",
       "Embarked        object\n",
       "dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titanic.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "We then check the number of levels that each of the five categorical variables have. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of levels in category 'Name': \b 891.00 \n",
      "Number of levels in category 'Sex': \b 2.00 \n",
      "Number of levels in category 'Ticket': \b 681.00 \n",
      "Number of levels in category 'Cabin': \b 148.00 \n",
      "Number of levels in category 'Embarked': \b 4.00 \n"
     ]
    }
   ],
   "source": [
    "for cat in ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']:\n",
    "    print(\"Number of levels in category '{0}': \\b {1:2.2f} \".format(cat, titanic[cat].unique().size))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "As we can see, `Sex` and `Embarked` have few levels. Let's find out what they are."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Levels for catgeory 'Sex': ['male' 'female']\n",
      "Levels for catgeory 'Embarked': ['S' 'C' 'Q' nan]\n"
     ]
    }
   ],
   "source": [
    "for cat in ['Sex', 'Embarked']:\n",
    "    print(\"Levels for catgeory '{0}': {1}\".format(cat, titanic[cat].unique()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "We then code these levels manually into numerical values. For `nan` i.e. the missing values, we simply replace them with a placeholder value (-999). In fact, we perform this replacement for the entire data set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "titanic['Sex'] = titanic['Sex'].map({'male':0,'female':1})\n",
    "titanic['Embarked'] = titanic['Embarked'].map({'S':0,'C':1,'Q':2})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PassengerId    False\n",
       "class          False\n",
       "Pclass         False\n",
       "Name           False\n",
       "Sex            False\n",
       "Age            False\n",
       "SibSp          False\n",
       "Parch          False\n",
       "Ticket         False\n",
       "Fare           False\n",
       "Cabin          False\n",
       "Embarked       False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titanic = titanic.fillna(-999)\n",
    "pd.isnull(titanic).any()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Since `Name` and `Ticket` have so many levels, we drop them from our analysis for the sake of simplicity. For `Cabin`, we encode the levels as digits using Scikit-learn's [`MultiLabelBinarizer`](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html) and treat them as new features. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "mlb = MultiLabelBinarizer()\n",
    "CabinTrans = mlb.fit_transform([{str(val)} for val in titanic['Cabin'].values])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [1, 0, 0, ..., 0, 0, 0],\n",
       "       ..., \n",
       "       [1, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [1, 0, 0, ..., 0, 0, 0]])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "CabinTrans"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Drop the unused features from the dataset. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "titanic_new = titanic.drop(['Name','Ticket','Cabin','class'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "assert (len(titanic['Cabin'].unique()) == len(mlb.classes_)), \"Not Equal\" #check correct encoding done"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "We then add the encoded features to form the final dataset to be used with TPOT. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "titanic_new = np.hstack((titanic_new.values,CabinTrans))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.isnan(titanic_new).any()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Keeping in mind that the final dataset is in the form of a numpy array, we can check the number of features in the final dataset as follows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "156"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titanic_new[0].size"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Finally we store the class labels, which we need to predict, in a separate variable. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "titanic_class = titanic['class'].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Data Analysis using TPOT"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "To begin our analysis, we need to divide our training data into training and validation sets. The validation set is just to give us an idea of the test set error. The model selection and tuning is entirely taken care of by TPOT, so if we want to, we can skip creating this validation set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(668, 223)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_indices, validation_indices = training_indices, testing_indices = train_test_split(titanic.index, stratify = titanic_class, train_size=0.75, test_size=0.25)\n",
    "training_indices.size, validation_indices.size"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "After that, we proceed to calling the `fit`, `score` and `export` functions on our training dataset. To get a better idea of how these functions work, refer the TPOT documentation [here](http://epistasislab.github.io/tpot/api/).\n",
    "\n",
    "An important TPOT parameter to set is the number of generations. Since our aim is to just illustrate the use of TPOT, we have set maximum optimization time to 2 minutes (max_time_mins=2). On a standard laptop with 4GB RAM, it roughly takes 5 minutes per generation to run. For each added generation, it should take 5 mins more. Thus, for the default value of 100, total run time could be roughly around 8 hours.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Warning: xgboost.XGBClassifier is not available and will not be used by TPOT.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 100%|██████████| 80/80 [00:12<00:00,  6.43pipeline/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 1 - Current best internal CV score: 0.8203972618112445\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 100%|██████████| 120/120 [00:19<00:00,  5.50pipeline/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 2 - Current best internal CV score: 0.8203972618112445\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 100%|██████████| 160/160 [00:32<00:00,  2.99pipeline/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 3 - Current best internal CV score: 0.8203972618112445\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 100%|██████████| 200/200 [00:43<00:00,  6.08pipeline/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 4 - Current best internal CV score: 0.8203972618112445\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 100%|██████████| 240/240 [00:52<00:00,  4.20pipeline/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 5 - Current best internal CV score: 0.8203972618112445\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 100%|██████████| 280/280 [01:03<00:00,  3.62pipeline/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 6 - Current best internal CV score: 0.8219341887055946\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 100%|██████████| 320/320 [01:15<00:00,  3.21pipeline/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 7 - Current best internal CV score: 0.8249418737556994\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 100%|██████████| 360/360 [01:26<00:00,  3.95pipeline/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 8 - Current best internal CV score: 0.8249418737556994\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 100%|██████████| 400/400 [01:38<00:00,  4.84pipeline/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 9 - Current best internal CV score: 0.8249418737556994\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 100%|██████████| 440/440 [01:48<00:00,  3.66pipeline/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 10 - Current best internal CV score: 0.8249418737556994\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 100%|██████████| 480/480 [02:51<00:00,  2.06s/pipeline]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 11 - Current best internal CV score: 0.8249418737556994\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                           9,  2.05s/pipeline]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "TPOT closed prematurely. Will use the current best pipeline.\n",
      "\n",
      "Best pipeline: RandomForestClassifier(input_matrix, RandomForestClassifier__bootstrap=False, RandomForestClassifier__criterion=DEFAULT, RandomForestClassifier__max_features=0.4, RandomForestClassifier__min_samples_leaf=1, RandomForestClassifier__min_samples_split=9)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r"
     ]
    }
   ],
   "source": [
    "tpot = TPOTClassifier(verbosity=2, max_time_mins=2, max_eval_time_mins=0.04, population_size=40)\n",
    "tpot.fit(titanic_new[training_indices], titanic_class[training_indices])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8340807174887892"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tpot.score(titanic_new[validation_indices], titanic.loc[validation_indices, 'class'].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "tpot.export('tpot_titanic_pipeline.py')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Let's have a look at the generated code. As we can see, the random forest classifier performed the best on the given dataset out of all the other models that TPOT currently evaluates on. If we ran TPOT for more generations, then the score should improve further."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# %load tpot_titanic_pipeline.py\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# NOTE: Make sure that the outcome column is labeled 'target' in the data file\n",
    "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n",
    "features = tpot_data.drop('target', axis=1)\n",
    "training_features, testing_features, training_classes, testing_classes = \\\n",
    "            train_test_split(features, tpot_data['target'], random_state=None)\n",
    "\n",
    "exported_pipeline = RandomForestClassifier(bootstrap=False, max_features=0.4, min_samples_leaf=1, min_samples_split=9)\n",
    "\n",
    "exported_pipeline.fit(training_features, training_classes)\n",
    "results = exported_pipeline.predict(testing_features)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Make predictions on the submission data "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>418.000000</td>\n",
       "      <td>418.000000</td>\n",
       "      <td>332.000000</td>\n",
       "      <td>418.000000</td>\n",
       "      <td>418.000000</td>\n",
       "      <td>417.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>1100.500000</td>\n",
       "      <td>2.265550</td>\n",
       "      <td>30.272590</td>\n",
       "      <td>0.447368</td>\n",
       "      <td>0.392344</td>\n",
       "      <td>35.627188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>120.810458</td>\n",
       "      <td>0.841838</td>\n",
       "      <td>14.181209</td>\n",
       "      <td>0.896760</td>\n",
       "      <td>0.981429</td>\n",
       "      <td>55.907576</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>892.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.170000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>996.250000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>7.895800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>1100.500000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>27.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>14.454200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>1204.750000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>39.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>31.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1309.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>76.000000</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>512.329200</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       PassengerId      Pclass         Age       SibSp       Parch        Fare\n",
       "count   418.000000  418.000000  332.000000  418.000000  418.000000  417.000000\n",
       "mean   1100.500000    2.265550   30.272590    0.447368    0.392344   35.627188\n",
       "std     120.810458    0.841838   14.181209    0.896760    0.981429   55.907576\n",
       "min     892.000000    1.000000    0.170000    0.000000    0.000000    0.000000\n",
       "25%     996.250000    1.000000   21.000000    0.000000    0.000000    7.895800\n",
       "50%    1100.500000    3.000000   27.000000    0.000000    0.000000   14.454200\n",
       "75%    1204.750000    3.000000   39.000000    1.000000    0.000000   31.500000\n",
       "max    1309.000000    3.000000   76.000000    8.000000    9.000000  512.329200"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read in the submission dataset\n",
    "titanic_sub = pd.read_csv('data/titanic_test.csv')\n",
    "titanic_sub.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "The most important step here is to check for new levels in the categorical variables of the submission dataset that are absent in the training set. We identify them and set them to our placeholder value of '-999', i.e., we treat them as missing values. This ensures training consistency, as otherwise the model does not know what to do with the new levels in  the submission dataset. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "for var in ['Cabin']: #,'Name','Ticket']:\n",
    "    new = list(set(titanic_sub[var]) - set(titanic[var]))\n",
    "    titanic_sub.ix[titanic_sub[var].isin(new), var] = -999"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "We then carry out the data munging steps as done earlier for the training dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "titanic_sub['Sex'] = titanic_sub['Sex'].map({'male':0,'female':1})\n",
    "titanic_sub['Embarked'] = titanic_sub['Embarked'].map({'S':0,'C':1,'Q':2})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PassengerId    False\n",
       "Pclass         False\n",
       "Name           False\n",
       "Sex            False\n",
       "Age            False\n",
       "SibSp          False\n",
       "Parch          False\n",
       "Ticket         False\n",
       "Fare           False\n",
       "Cabin          False\n",
       "Embarked       False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "titanic_sub = titanic_sub.fillna(-999)\n",
    "pd.isnull(titanic_sub).any()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "While calling `MultiLabelBinarizer` for the submission data set, we first fit on the training set again to learn the levels and then transform the submission dataset values. This further ensures that only those levels that were present in the training dataset are transformed. If new levels are still found in the submission dataset then it will return an error and we need to go back and check our earlier step of replacing new levels with the placeholder value. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "mlb = MultiLabelBinarizer()\n",
    "SubCabinTrans = mlb.fit([{str(val)} for val in titanic['Cabin'].values]).transform([{str(val)} for val in titanic_sub['Cabin'].values])\n",
    "titanic_sub = titanic_sub.drop(['Name','Ticket','Cabin'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Form the new submission data set\n",
    "titanic_sub_new = np.hstack((titanic_sub.values,SubCabinTrans))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.any(np.isnan(titanic_sub_new))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Ensure equal number of features in both the final training and submission dataset\n",
    "assert (titanic_new.shape[1] == titanic_sub_new.shape[1]), \"Not Equal\" "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Generate the predictions\n",
    "submission = tpot.predict(titanic_sub_new)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Create the submission file\n",
    "final = pd.DataFrame({'PassengerId': titanic_sub['PassengerId'], 'Survived': submission})\n",
    "final.to_csv('data/submission.csv', index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(418, 2)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "There we go! We have successfully generated the predictions for the 418 data points in the submission dataset, and we're good to go ahead to submit these predictions on Kaggle. "
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
